In this example, we would take up the binary classification task on possible configurations of tic-tac-toe game.
Dataset : https://archive.ics.uci.edu/ml/datasets/Tic-Tac-Toe+Endgame
This database encodes the complete set of possible board configurations at the end of tic-tac-toe games, where "x" is assumed to have played first. The target concept is "win for x" (i.e., true when "x" has one of 8 possible ways to create a "three-in-a-row").
Interestingly, this raw database gives a stripped-down decision tree algorithm (e.g., ID3) fits. However, the rule-based CN2 algorithm, the simple IB1 instance-based learning algorithm, and the CITRE feature-constructing decision tree algorithm perform well on it.
In [1]:
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
In [12]:
# read .csv from provided dataset
csv_filename="tic-tac-toe.data"
# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename,
names=["TopLeft", "TopMiddle" , "TopRight" ,
"MiddleLeft", "MiddleMiddle" , "MiddleRight" ,
"BottomLeft", "BottomMiddle" , "BottomRight" , "Class" ])
In [13]:
df.head()
Out[13]:
In [14]:
df['Class'].unique()
Out[14]:
In [8]:
#Convert animal labels to numbers
le = preprocessing.LabelEncoder()
for col in df.columns:
df[col] = le.fit_transform(df[col])
In [9]:
df.head()
Out[9]:
In [15]:
for col in df.columns:
df[col] = pd.get_dummies(df[col])
In [16]:
df.head()
Out[16]:
In [19]:
features=(list(df.columns[:-1]))
In [20]:
features
Out[20]:
In [21]:
X = df[features]
y = df['Class']
In [22]:
X.head()
Out[22]:
In [23]:
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [24]:
print X_train.shape, y_train.shape
In [25]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
In [26]:
importances[indices[:5]]
Out[26]:
In [27]:
for f in range(5):
print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))
In [28]:
best_features = []
for i in indices[:5]:
best_features.append(features[i])
In [29]:
# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5],
color="r", yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()
In [30]:
t0=time()
print "DecisionTree"
dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)
clf_dt=dt.fit(X_train,y_train)
print "Acurracy: ", clf_dt.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0
In [31]:
tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X, y, cv=3)
print scores
print scores.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"
In [44]:
from sklearn.metrics import classification_report
pipeline = Pipeline([
('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
'clf__max_depth': (5, 25 , 50),
'clf__min_samples_split': (1, 5, 10),
'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)
In [47]:
t0=time()
print "DecisionTree"
dt1 = DecisionTreeClassifier(min_samples_split=1,max_depth=25,min_samples_leaf=1, random_state=99)
clf_dt1=dt1.fit(X_train,y_train)
print "Acurracy: ", clf_dt1.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0
export_graphviz(clf_dt1,
out_file='tree.dot',
feature_names=features)
After we have installed GraphViz on our computer, we can convert the tree.dot file into a PNG file by executing the following command from the command line in the location where we saved the tree.dot file:
dot -Tpng tree.dot -o tree.png
In [33]:
t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print "Acurracy: ", clf_rf.score(X_test,y_test)
t3=time()
print "time elapsed: ", t3-t2
In [34]:
tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(rf, X, y, cv=3)
print scores
print scores.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"
In [48]:
pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])
parameters = {
'clf__n_estimators': (5, 25, 50, 100),
'clf__max_depth': (5, 25 , 50),
'clf__min_samples_split': (1, 5, 10),
'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)
In [35]:
t4=time()
print "NaiveBayes"
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
print "Acurracy: ", clf_nb.score(X_test,y_test)
t5=time()
print "time elapsed: ", t5-t4
In [36]:
tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(nb, X,y, cv=3)
print scores
print scores.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"
In [37]:
t6=time()
print "KNN"
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier(n_neighbors=3)
clf_knn=knn.fit(X_train, y_train)
print "Acurracy: ", clf_knn.score(X_test,y_test)
t7=time()
print "time elapsed: ", t7-t6
In [38]:
tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(knn, X,y, cv=5)
print scores
print scores.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"
In [92]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
knn = KNeighborsClassifier()
parameters = {'n_neighbors':[1,10]}
grid = grid_search.GridSearchCV(knn, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
In [39]:
t7=time()
print "SVM"
svc = SVC()
clf_svc=svc.fit(X_train, y_train)
print "Acurracy: ", clf_svc.score(X_test,y_test)
t8=time()
print "time elapsed: ", t8-t7
In [41]:
tt7=time()
print "cross result========"
scores = cross_validation.cross_val_score(svc,X,y, cv=5)
print scores
print scores.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"
In [42]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
In [43]:
pipeline = Pipeline([
('clf', SVC(kernel='linear', gamma=0.01, C=10))
])
parameters = {
'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)